Supp Material for Luminous Ostracod Vargula tsujii
The R Juypter Notebook serves as a comprehensive repository encompasing most of the scripts and figures relevant to the bioluminescent ostracod V.tsujii analyses outlined in the publication. This notebook includes the following analyses: QC steps, WGCNA, Network Visualization, Network Connectivity, Differential Gene Expression and GO enrichments.
Author: Lisa Yeter Mesrop
#load libraries
library(WGCNA)
library(tidyverse)
library(edgeR)
library(matrixStats)
library(DESeq2)
library(dplyr)
library(readxl)
library(data.table)
library(ggplot2)
library(ggrepel)
library(repr)
library(topGO)
library(reshape2)
library(plyr)
library(scales)
library(readxl)
library(repr)
library(RColorBrewer)
library(pheatmap)
library(flashClust)
library(ggvenn)
#always use the following WGCNA functions
options(stringsAsFactors = FALSE)
enableWGCNAThreads()
allowWGCNAThreads(nThreads = 22)
The gene expression matrix consists of 57 samples and 41,486 genes before quality control (QC).
#read in count matrix
merged_counts <- read.csv("merged_single_network.csv", header = TRUE, row.names=1,
stringsAsFactors = FALSE)
#create the metadata
meta <- data.frame(row.names = colnames(merged_counts))
sample_name = c("upper_lip_1","upper_lip_1","upper_lip_1","upper_lip_1","upper_lip_1","upper_lip_1","upper_lip_1","upper_lip_1",
"upper_lip_1","upper_lip_2", "upper_lip_2", "upper_lip_2", "upper_lip_2", "upper_lip_2",
"eye", "gut", "eye", "gut", "eye", "gut","eye", "gut","eye", "gut","A", "A", "A", "AIF", "AIF", "AIF",
"AII", "AII", "AII", "AIII", "AIII", "AIII", "AIM", "AIM", "AIM", "AIV", "AIV", "AIV","AV", "AV", "AV", "C","C","C","E", "E", "E", "G", "H", "H", "M", "M","M")
meta$sample_name <- sample_name
meta$names <- rownames(meta)
rownames(meta) <- NULL
head(meta)
As a preliminary quality control (QC) measure for WGCNA analysis, the overall similarity between samples and transcripts with low counts was assessed, as these counts often introduce noise in co-expression analyses. A filter was applied to the expression matrix, removing transcripts with fewer than 5 counts in more than 3 samples, given that some sample types included a minimum of 3 biological replicates. The QC analyses utilized functions from the DESeq2 package (Love et al., 2014).
#import count table, meta and sample_name objects into a DESeq2 object
dds_count_table <- DESeqDataSetFromMatrix(countData = merged_counts, colData = meta, design = ~sample_name)
The counts of filtered reads were examined using a bar plot.
#the number of prefiltered counts for each sample
colSums(assay(dds_count_table))
#function from the R package repr to visualize figures in Jupyter Notebook(optional)
options(repr.plot.width=10, repr.plot.height=10)
#extract sample names for barplot
names <- dds_count_table$sample_name
#visualize prefiltered raw counts in a barplot
librarySizes <- colSums(assay(dds_count_table))
par(mar=c(10,5,2,2))
barplot(librarySizes,
names=names,
las=2,
cex.names=.5) +title(main = "Barplot of Count Distributions of Samples", line = -1, outer = TRUE)
Based on the bar plot count distribution in Section 3.1, the following six samples were removed due to very low counts: M1.1.counts.tab, M2.1.counts.tab, M3.1.counts.tab, AV.1.counts.tab, AV.2.counts.tab, AV.4.counts.tab.
#update the merged counts, meta and sample_name objects with the six samples removed
merged_counts_subset <- subset(merged_counts, select= -c(M1.1.counts.tab, M2.1.counts.tab, M3.1.counts.tab, AV.1.counts.tab, AV.2.counts.tab, AV.4.counts.tab))
meta_subset <- data.frame(row.names = colnames(merged_counts_subset))
sample_name_subset = c("upper_lip","upper_lip","upper_lip","upper_lip","upper_lip","upper_lip","upper_lip","upper_lip",
"upper_lip","upper_lip", "upper_lip", "upper_lip", "upper_lip", "upper_lip",
"eye", "gut", "eye", "gut", "eye", "gut","eye", "gut","eye", "gut","A", "A", "A", "AIF", "AIF", "AIF",
"AII", "AII", "AII", "AIII", "AIII", "AIII", "AIM", "AIM", "AIM", "AIV", "AIV", "AIV", "C","C","C","E", "E", "E", "G", "H", "H")
meta_subset$sample_name_subset <- sample_name_subset
meta_subset$names_subset <- rownames(meta_subset)
rownames(meta_subset) <- NULL
nrow(meta_subset)
Perform variance-stabilizing transformation using DEseq2 package (Love et al., 2014). The authors of WGCNA recommend variance-stabilizing transformation as a normalization method before conducting network analyses (Langfelder and Horvath, 2008).
#import updated count table, meta and sample_name objects into a DESeq2 object
dds_count_table_subset<- DESeqDataSetFromMatrix(countData = merged_counts_subset, colData = meta_subset, design = ~sample_name_subset)
#filter the count table
dds_merged_table_prefiltered <- dds_count_table_subset[rowSums(counts(dds_count_table_subset) >=5) >=3,]
nrow(dds_merged_table_prefiltered)
#run DESeq2 function with variance-stabilizing transformation
dds_subset <- DESeq(dds_merged_table_prefiltered, betaPrior = FALSE, parallel = TRUE)
#perform a variance-stabilizing transformation
vsd_subset <- varianceStabilizingTransformation(dds_subset)
#transpose the matrix
sampleDists_subset <- dist(t(assay(vsd_subset)))
#perform heatmap
sampleDistMatrix <- as.matrix(sampleDists_subset)
rownames(sampleDistMatrix) <- paste(colData(dds_merged_table_prefiltered)$sample_name_subset)
colnames(sampleDistMatrix) <- colData(dds_merged_table_prefiltered)$names_subset
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists_subset,
clustering_distance_cols=sampleDists_subset,
col=colors)
#perform PCA
pcaData <- plotPCA(vsd_subset, intgroup="sample_name_subset", returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=sample_name_subset, shape=sample_name_subset)) +
geom_point(size=5) +
labs(color = "Sample Types")+ labs(shape = "Sample Types")+
scale_shape_manual(values = c(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16))+
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed()+theme(
panel.grid.major = element_line(colour = "gray97", size = 1),
panel.grid.minor = element_line(linetype = "dotted"),
panel.background = element_rect(fill = NA),
legend.key = element_rect(fill = "gray100"),
axis.line = element_line(size = 0.5, linetype = "solid"),
panel.border = element_rect(colour = "black", fill = NA, size = 1)
) + theme(
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
legend.title = element_text(size = 16),
legend.text = element_text(size = 12)
)
After completing the prefiltering and normalization steps above, the count matrix now includes 51 samples and 31,097 transcript and is ready for WGCNA. WGCNA (Weighted Gene Co-expression Network Analysis) identifies clusters (modules) of highly correlated genes by constructing a network based on pairwise correlations between gene expression profiles (Langfelder and Horvath, 2008). These modules often correspond to specific biological processes or pathways, indicating that the genes within a module may be part of the same regulatory process. By analyzing the connectivity of genes within each module, WGCNA also helps identify key drivers or hub genes, providing insights into gene regulation and the biological processes they govern. The following scripts are from the WGCNA package (Langfelder and Horvath, 2008).
#use the prefiltered and normalized count matrix from the DEseq2
vsd_subset_matrix <- assay(vsd_subset)
## many functions expect the matrix to be transposed
datExpr <- t(vsd_subset_matrix)
## check rows/cols
nrow(datExpr)
ncol(datExpr)
#ready to start WGCNA Analysis
#run this to check if there are gene outliers
gsg=goodSamplesGenes(datExpr, verbose = 3)
gsg$allOK
# choose a set of soft-thresholding powers
powers = c(c(1:10), seq(from = 12, to=20, by=2))
# call the network topology analysis function
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
# plot the results:
cex1 = 0.9;
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
# co-expression similarity and adjacency using assigned softpower
softPower=8
adjacency = adjacency(datExpr, power = softPower)
# calculate the Topological Overlap Matrix (TOM)
# turn adjacency into topological overlap, i.e. translate the adjacency into
# topological overlap matrix and calculate the corresponding dissimilarity
TOM = TOMsimilarity(adjacency, TOMType = "signed", verbose = 5);
dissTOM = 1-TOM;
# call the hierarchical clustering function
geneTree = hclust(as.dist(dissTOM), method = "average");
# plot the resulting clustering tree (dendrogram)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
labels = FALSE, hang = 0.04);
# set the min module size
minModuleSize = 30;
# module identification using dynamic tree cut:
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,
deepSplit = 2, pamRespectsDendro = FALSE,
minClusterSize = minModuleSize);
table(dynamicMods)
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
# plot the dendrogram and colors underneath
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05,
main = "Gene dendrogram and module colors")
# calculate eigengenes
MEList = moduleEigengenes(datExpr, colors = dynamicColors)
MEs = MEList$eigengenes
# calculate dissimilarity of module eigengenes
MEDiss = 1-cor(MEs);
# cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = "average");
# plot the result
plot(METree, main = "Clustering of module eigengenes",
xlab = "", sub = "")
MEDissThres = 0.25
# plot the cut line into the dendrogram
abline(h=MEDissThres, col = "red")
# call an automatic merging function
merge = mergeCloseModules(datExpr, dynamicColors, cutHeight = MEDissThres, verbose = 3)
# merged module colors
mergedColors = merge$colors;
# eigengenes of the new merged modules
mergedMEs = merge$newMEs;
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
The functionally demonstrated c-luciferase gene, along with some phylogenetically similar c-luciferase genes, are located in the red module, which we refer to as the Bioluminescent Co-Regulatory Network (BCN).
#determine which column number in the datExpr object that corresponds to c-luciferase
which(colnames(datExpr) == "NODE_10321_length_1972_cov_1770.41_g3092_i1")
#determine the color of the module with c-luciferase
dynamicColors[[389]]
# extract all modules as a table for downstream analyses
SubGeneNames = colnames(datExpr)
for (color in dynamicColors){
module=SubGeneNames[which(dynamicColors==color)]
write.table(module, paste("module_",color, ".txt",sep=""), sep="\t", row.names=FALSE, col.names=FALSE,quote=FALSE)
}
# extract the red module of interest from the dynamicColors object
SubGeneNames = colnames(datExpr)
red=as.data.frame(SubGeneNames[which(dynamicColors=="red")])
names(red)[1] <- "transcript_id"
head(red)
#read in the Trinotate sheet for Vargula tsujii
Trinotate_lym_subset <- read_excel("Trinotate_lym_subset.xlsx")
#incorporate annotation for the red module (the BCN)
red_module_gene_annot <- setDT(Trinotate_lym_subset, key = 'transcript_id')[J(red)]
# red module (the BCN)
head(red_module_gene_annot)
#extract the module that is significantly correlated with the gut sample (from Section 4.8) from the dynamicColors object
purple_gut=as.data.frame(SubGeneNames[which(dynamicColors=="purple")])
names(purple_gut)[1] <- "transcript_id"
purple_gut_gene_annot <- setDT(Trinotate_lym_subset, key = 'transcript_id')[J(purple_gut)]
#extract the module that is correlated with forced lum sample (from Section 4.8) from the dynamicColors object
SubGeneNames = colnames(datExpr)
salmon_forced_lum=as.data.frame(SubGeneNames[which(dynamicColors=="salmon")])
names(salmon_forced_lum)[1] <- "transcript_id"
#match the annotation with each transcript
salmon_forced_lum_module_gene_annot <- setDT(Trinotate_lym_subset, key = 'transcript_id')[J(salmon_forced_lum)]
Identify modules (network) that are significantly associated with samples. The module eigengene provides a representative measure of the gene expression patterns within a module, allowing correlation with these traits to determine the most significant associations (Langfelder and Horvath, 2008). This correlation can then be visualized in a module-to-trait heatmap to determine the most significant associations.The following scripts are from the WGCNA package (Langfelder and Horvath, 2008).
traitData <- read_excel("traits_all_tsujii_updated.xlsx")
traitData <- as.data.frame(traitData)
names(traitData)
head(traitData)
allTraits = traitData
head(allTraits)
names(allTraits)
All_samples = rownames(datExpr)
traitRows = match(All_samples, allTraits$Samples)
traitRows
datTraits = allTraits[traitRows, -1]
rownames(datTraits) = allTraits[traitRows, 1]
# double check that row names agree
table(rownames(datTraits)==rownames(datExpr))
datTraits$Samples_names <- NULL
head(datTraits)
# sample network based on squared Euclidean distance
A=adjacency(t(datExpr),type="distance")
# this calculates the whole network connectivity
k=as.numeric(apply(A,2,sum))-1
# standardized connectivity
Z.k=scale(k)
# designate samples as outlying
# if their Z.k value is below the threshold
thresholdZ.k=-5 # often -2.5
# the color vector indicates outlyingness (red)
outlierColor=ifelse(Z.k<thresholdZ.k,"red","black")
# calculate the cluster tree using flahsClust or hclust
sampleTree = flashClust(as.dist(1-A), method = "average")
# convert traits to a color representation:
# where red indicates high values
traitColors=data.frame(numbers2colors(datTraits,signed=FALSE))
dimnames(traitColors)[[2]]=paste(names(datTraits),"C",sep="")
datColors=data.frame(outlierC=outlierColor,traitColors)
# plot the sample dendrogram and the colors underneath.
plotDendroAndColors(sampleTree,groupLabels=names(datColors),
colors=datColors,main="Sample dendrogram and trait heatmap")
# choose a module assignment
moduleColors_Vtsujii=dynamicColors
# define numbers of genes and samples
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
# recalculate MEs with color labels
MEs0 = moduleEigengenes(datExpr,moduleColors_Vtsujii)$eigengenes
MEsFemale = orderMEs(MEs0)
modTraitCor = cor(MEsFemale, datTraits, use = "p")
modTraitP = corPvalueStudent(modTraitCor, nSamples)
#color code each association by the correlation value; display correlations and their p-values
textMatrix = paste(signif(modTraitCor, 2), "\n(",
signif(modTraitP, 1), ")", sep = "")
dim(textMatrix) = dim(modTraitCor)
par(mar = c(8, 8.5, 3, 3))
# display the correlation values within a heatmap plot
labeledHeatmap(Matrix = modTraitCor, xLabels = names(datTraits),
yLabels = names(MEsFemale), ySymbols = names(MEsFemale),
colorLabels =FALSE,colors = blueWhiteRed(50),textMatrix=textMatrix,
setStdMargins = FALSE, cex.text = 0.5, zlim = c(-1,1),
main = paste("Module-trait relationships"))
The Bioluminescent Co-regulatory Network (BCN) has a total of 889 co-expressed transcripts making it challenging to visualize the entire network topology comprehensively. To address this, we used Cytoscape to visualize the network and we subsetted the network to only include genes that are signifcantly upregulated (uniquely) in the bioluminescent upper lip from Section 9.4.
Export the network of interest into edge and node list files for Cytoscape (Shannon et al., 2003).
# select module of interest
module = "red"
# select module probes
genes = colnames(datExpr)
inModule = dynamicColors==module
modProbes = genes[inModule]
# select the corresponding Topological Overlap
modTOM = TOM_visnet_power8[inModule, inModule]
dimnames(modTOM) = list(modProbes, modProbes)
# export the network into edge and node list files Cytoscape
cyt = exportNetworkToCytoscape(modTOM,
edgeFile = paste("CytoscapeInput-edges-", paste(module, collapse="-"), ".txt", sep=""),
nodeFile = paste("CytoscapeInput-nodes-", paste(module, collapse="-"), ".txt", sep=""),
weighted = TRUE,
threshold = 0.02,
nodeNames = modProbes,
altNodeNames = modProbes,
nodeAttr = dynamicColors[inModule])
#determine how many of the significantly upregulated genes expressed in the bioluminescent upper lip (from Section 9.4) are found in the BCN
BCN_DE_unique_Bio_UpperLip <- red %>%
filter(transcript_id %in% unique_genes_bio_upper_lip_info_annot$transcript_id)
#add annotation
BCN_DE_unique_Bio_UpperLip_annot <- left_join(BCN_DE_unique_Bio_UpperLip,Trinotate_lym_subset,by="transcript_id")
GO enrichment analyses was performed using topGO package (Alexa Rahnenfuhrer, 2024). The figures representing the GO enrichments presented here were utilized for analysis, but were not included in the main text of the publication. The GO enrichments were visualized with GO-Figure! package for publication (Reijnders and Waterhouse, 2021). GO-Figure! reference https://github.com/lmesrop/BCN_publication/tree/main/Go-Figure!.
#import the trinotate go sheet from the Trinotate output
geneID2GO <- readMappings(file ="Trinotate_go_lym.txt")
geneNames <- names(geneID2GO)
#save the transcript ids of all the annotated genes under geneNames object
geneNames<- as.character(Trinotate_lym_subset$transcript_id)
#save the transcript
myInterestingGenes= as.character(red[,1])
#subset the genesNames by the transcript IDs in my red module
geneList <- factor(as.integer(geneNames %in% myInterestingGenes))
names(geneList) <- geneNames
head(geneList)
#run the topGO function.
GOdata <- new("topGOdata", ontology = "BP", allGenes = geneList,
annot = annFUN.gene2GO, gene2GO = geneID2GO)
results_go <- runTest(GOdata, algorithm="weight01", statistic="Fisher")
#retrieve the GO enrichment
goEnrichment <- GenTable(GOdata, Fisher = results_go, orderBy = "Fisher", topNodes = 100, numChar=1000)
#graph the GO enrichment
goEnrichment$Fisher <- as.numeric(goEnrichment$Fisher)
goEnrichment <- goEnrichment[goEnrichment$Fisher < 0.05,]
goEnrichment <- goEnrichment[goEnrichment$Significant > 2,]#remove any singletons
goEnrichment
#extract transcript ids that are significantly enriched in the BCN
myterms =goEnrichment$GO.ID
mygenes = genesInTerm(GOdata, myterms)
#extract the transcript ids for each GO term
var=c()
for (i in 1:length(myterms))
{
myterm <- myterms[i]
mygenesforterm <- mygenes[myterm][[1]]
myfactor <- mygenesforterm %in% myInterestingGenes
mygenesforterm2 <- mygenesforterm[myfactor == TRUE]
mygenesforterm2 <- paste(mygenesforterm2, collapse=',')
var[i]=paste(myterm,"genes:",mygenesforterm2)
}
#write.table(var, "BCN_GO_to_mapping.txt", sep="\t", quote=F)
#removed GO enrichment terms that have the same exact set of genes
goEnrichment_subsetted <- read_excel("GO_enrichment_GO_figure_subsetted.xlsx")
#plot the GO enrichment results
ntop = 25
ggdata <- goEnrichment[1:ntop,]
ggdata$Term <- factor(ggdata$Term, levels = rev(ggdata$Term))
plot_BCN_GO_BP <- ggplot(ggdata,
aes(x = Term, y = -log10(Fisher), size = Significant, fill = -log10(Fisher))) +
ylim(-1,21) +
geom_point(shape = 21) +
scale_size(range = c(2.5,12.5)) +
scale_fill_continuous(low = 'royalblue', high = 'red4') +
labs(
title = 'GO enrichment - BP - BCN')+
theme_bw(base_size = 20) +
labs(size= "Number of Genes")+
theme(
legend.position = 'right',
legend.background = element_rect(),
plot.title = element_text(angle = 0, size = 20, face = 'bold', vjust = 1),
plot.subtitle = element_text(angle = 0, size = 10, face = 'bold', vjust = 1),
plot.caption = element_text(angle = 0, size = 12, face = 'bold', vjust = 1),
axis.text.x = element_blank(),
axis.text.y = element_text(angle = 0, size = 13, face = 'bold', vjust = 0.5),
axis.title = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_text(size = 8, face = 'bold'),
axis.line = element_line(colour = 'black'),
#Legend
legend.key = element_blank(),
legend.text = element_text(size = 16, face = "bold"))+
coord_flip()
plot_BCN_GO_BP + labs(x = NULL)
#options(repr.plot.width=14, repr.plot.height=8, repr.plot.res = 500)
#run the topGO function
GOdata_MF <- new("topGOdata", ontology = "MF", allGenes = geneList,
annot = annFUN.gene2GO, gene2GO = geneID2GO)
results_go_MF <- runTest(GOdata_MF, algorithm="weight01", statistic="Fisher")
#retrieve the GO enrichment
goEnrichment_MF <- GenTable(GOdata_MF, Fisher = results_go_MF, orderBy = "Fisher", topNodes = 100, numChar=1000)
goEnrichment_MF$Fisher <- as.numeric(goEnrichment_MF$Fisher)
goEnrichment_MF <- goEnrichment_MF[goEnrichment_MF$Fisher < 0.05,]
goEnrichment_MF <- goEnrichment_MF[goEnrichment_MF$Significant > 2,]
goEnrichment_MF
#plot GO MF enchriment plot
ntop = 23
ggdata <- goEnrichment_MF[1:ntop,]
ggdata$Term <- factor(ggdata$Term, levels = rev(ggdata$Term))
GO_MF_BCN_plot <- ggplot(ggdata,
aes(x = Term, y = -log10(Fisher), size = Significant, fill = -log10(Fisher))) +
ylim(-1,21) +
geom_point(shape = 21) +
scale_size(range = c(2.5,12.5)) +
scale_fill_continuous(low = 'royalblue', high = 'red4') +
labs(
title = 'GO Analysis - MF - BCN')+
theme_bw(base_size = 20) +
labs(size= "Number of Genes")+
theme(
legend.position = 'right',
legend.background = element_rect(),
plot.title = element_text(angle = 0, size = 20, face = 'bold', vjust = 1),
plot.subtitle = element_text(angle = 0, size = 10, face = 'bold', vjust = 1),
plot.caption = element_text(angle = 0, size = 12, face = 'bold', vjust = 1),
axis.text.x = element_blank(),
axis.text.y = element_text(angle = 0, size = 13, face = 'bold', vjust = 0.5),
axis.title = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_text(size = 8, face = 'bold'),
axis.line = element_line(colour = 'black'),
#Legend
legend.key = element_blank(),
legend.text = element_text(size = 16, face = "bold"))+
coord_flip()
GO_MF_BCN_plot + labs(x=NULL)
#options(repr.plot.width=18, repr.plot.height=8, repr.plot.res = 500)
WGCNA identifies genes integral to a (module) network using a network characteristic called module membership (Langfelder and Horvath, 2008). Module membership represents connectivity of a gene with other genes within a module and is used to define centralised hub genes (Langfelder and Horvath, 2008). Module membership (MM) is a measure ranging from 0 to 1, with higher values indicating strong connectivity within a module and lower values indicating weak connectivity (Langfelder and Horvath, 2008). Genes with similar expression patterns within a module are not only correlated with the module's overall expression (MM) but also tightly interconnected with other genes in the same module (IC) (Langfelder and Horvath, 2008). In other words, genes that are strongly correlated with the overall expression pattern of their modules (high MM) are also highly connected to other genes within those modules (high IC). There is a strong correlation between intramodule connectivity and module membership (Langfelder and Horvath, 2008).
#select the same power used for WGCNA analysis in Section 4.3
ADJ1=abs(cor(datExpr,use="p"))^8
Alldegrees1=intramodularConnectivity(ADJ1, dynamicColors)
head(Alldegrees1)
datME=moduleEigengenes(datExpr,dynamicColors)$eigengenes
signif(cor(datME, use="p"), 2)
datKME =signedKME(datExpr, datME, outputColumnName="MM.")
head(datKME)
nrow(datKME)
#determine the module membership vs intramodular connectivity for the red module (the BCN)
which.color="red"
restrictGenes=dynamicColors==which.color
verboseScatterplot(Alldegrees1$kWithin[ restrictGenes],
(datKME[restrictGenes, paste("MM.", which.color, sep="")])^10,
col=which.color,
xlab="Intramodular Connectivity",
ylab="(Module Membership)^8")
#function from the R package repr to visualize figures in Jupyter Notebook(optional)
options(repr.plot.width=8, repr.plot.height=7)
Identify all co-expressed transcripts within the BCN (red module) that have a high module membership (MM > 0.8). An MM value of 0.8 or higher indicates a strong association with the module eigengene, suggesting that the gene plays a central role in the module's network and can be considered a hub gene (Langfelder and Horvath, 2008).
combined_datKME_ADJ1 = merge(datKME, Alldegrees1, by=0)
head(combined_datKME_ADJ1)
nrow(combined_datKME_ADJ1)
subset_combined_datKME_ADJ1 = dplyr::select(combined_datKME_ADJ1, c("Row.names","MM.red", "kWithin","kTotal"))
head(subset_combined_datKME_ADJ1)
nrow(subset_combined_datKME_ADJ1)
#make the transcript ids rownames again
row.names(subset_combined_datKME_ADJ1) <- subset_combined_datKME_ADJ1$Row.names
subset_combined_datKME_ADJ1[1] <- NULL
head(subset_combined_datKME_ADJ1)
#subset the subset_combined_datKME_ADJ1 to include just transcripts from the red module (the BCN)
top_datKME_ADJ1_.8 <-subset_combined_datKME_ADJ1 %>% dplyr::filter(subset_combined_datKME_ADJ1$MM.red >= 0.79)
head(top_datKME_ADJ1_.8)
#make rowname a column for downstream subsetting
top_datKME_ADJ1_.8$transcript_id <- rownames(top_datKME_ADJ1_.8)
head(top_datKME_ADJ1_.8)
nrow(top_datKME_ADJ1_.8)
#reorganize the column orders
top_datKME_ADJ1_.8_column_reordered <- top_datKME_ADJ1_.8[, c("transcript_id", "MM.red", "kWithin", "kTotal")]
head(top_datKME_ADJ1_.8_column_reordered)
# reorder spreadsheet based on MM.red column from largest to smallest
top_datKME_ADJ1_.8_column_reordered_desc <-top_datKME_ADJ1_.8_column_reordered %>% dplyr::arrange(desc(kWithin))
head(top_datKME_ADJ1_.8_column_reordered_desc)
#annotate the top datKME genes
top_datKME_ADJ1_.8_column_reordered_desc_annot_trinotate <- left_join(top_datKME_ADJ1_.8_column_reordered_desc,Trinotate_lym_subset,by="transcript_id")
head(top_datKME_ADJ1_.8_column_reordered_desc_annot_trinotate)
#write.csv(top_datKME_ADJ1_.8_column_reordered_desc_annot_trinotate, file = "top_datKME_ADJ1_.8_desc_BCN.csv")
Use the results from the DGE analysis in Section Step 9.4 to identify the significantly upregulated genes (expressed uniquely) with the highest module membership (MM > 0.8).
#significantly upregulated genes (expressed uniquely)from section 9.4.4
colnames(unique_genes_bio_upper_lip_info)
#change the gene column name to transcript ID
colnames(unique_genes_bio_upper_lip_info)[1] <- "transcript_id"
#determine how many of the co-expressed genes in the BCN with MM > 0.8 are significantly upregulated in the bioluminescent upper lip
top_datKME_ADJ1_.8_column_reordered_desc_annot_DE_bio_upper_lip <- left_join(top_datKME_ADJ1_.8_column_reordered_desc_annot_trinotate,unique_genes_bio_upper_lip_info,by="transcript_id")
head(top_datKME_ADJ1_.8_column_reordered_desc_annot_DE_bio_upper_lip)
#write.csv(top_datKME_ADJ1_.8_column_reordered_desc_annot_DE_bio_upper_lip , file = "top_datKME_ADJ1_.8_annot_DE_unique_Bio_UpperLip.csv")
Import V.tsujii gene expression matrix which consists of three tissue types - upper lip, compound eye and gut - with five biological replicates for each tissue. Generate the sample name sheet (meta sheet) for downstream DESeq2 analyses.
#read in the gene expression matrix
organ_level_vt <- read.delim("combined_vargula_tsujii.tab", header = TRUE, sep = "\t", quote = "")
head(organ_level_vt)
#remove the extra column that got inserted
row.names(organ_level_vt) <-organ_level_vt$X
organ_level_vt[1]<-NULL
organ_level_vt$X.1 <- NULL
#import the sample sheet
sampleinfo <- read_excel("sampleinfo_vtsujii_DGE.xlsx")
sampleinfo
#generate the DESeq data set
dds_count_table_organ_level <- DESeqDataSetFromMatrix(countData = organ_level_vt, colData = sampleinfo, design = ~group)
#run DESeq2 function and normalization
dds_vtsujii <- DESeq(dds_count_table_organ_level, betaPrior = FALSE, parallel = TRUE)
#perform a variance-stabilizing transformation
dds_vtsujii_vsd <- varianceStabilizingTransformation(dds_vtsujii)
#transpose the matrix
sampleDists_dds_vtsujii_vsd <- dist(t(assay(dds_vtsujii_vsd)))
#plot the heatmap
sampleDistMatrix_dds_vtsujii_vsd <- as.matrix(sampleDists_dds_vtsujii_vsd)
rownames(sampleDistMatrix_dds_vtsujii_vsd) <- paste(colData(dds_count_table_organ_level)$group)
colnames(sampleDistMatrix_dds_vtsujii_vsd) <- colData(dds_count_table_organ_level)$samples
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix_dds_vtsujii_vsd,
clustering_distance_rows=sampleDists_dds_vtsujii_vsd,
clustering_distance_cols=sampleDists_dds_vtsujii_vsd,
col=colors)
options(repr.plot.width=8, repr.plot.height=6, repr.plot.res = 150)
pcaData <- plotPCA(dds_vtsujii_vsd, intgroup="group", returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=group, shape=group)) +
labs(color = "Tissue Types")+ labs(shape = "Tissue Types")+
geom_point(size=5) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
geom_point(size=5) +
theme(panel.grid.major = element_line(colour = "gray97", size = 1), panel.grid.minor = element_line(linetype = "dotted"), panel.background = element_rect(fill = NA),
legend.key = element_rect(fill = "gray100")) + theme(axis.line = element_line(size = 0.5,linetype = "solid")) +
theme(panel.border = element_rect(colour = "black", fill=NA, size=1)) +
coord_fixed() +
scale_color_manual(values = c('#F2C93D','#C97D97','#AC97C9'))+theme(
legend.title = element_text(size = 16),
legend.text = element_text(size = 14),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text = element_text(size = 12)
)
Differential gene expression analysis was carried out in DESeq2 (Love et al., 2014). For Vargula tsujii , we determined differentially upregulated genes in three tissue types - bioluminescent upper lip, compound eye and gut - using five biological replicates for each tissue. DESeq2 was employed using a p-value < 0.05 and FC > 1.5 for the significance of differentially expressed genes using the Benjamini-Hochberg method to account for false discovery rate (FDR). Pairwise comparisons were done across tissue types (i.e., bioluminescent upper lip to compound eye, bioluminescent upper lip to gut, gut to compound eye). To identify tissue-specific differential expression (i.e. significantly upregulated genes that are uniquely expressed), each tissue was compared to the other two. For example, the expression in the bioluminescent upper lip was determined by comparing it to both the compound eye and the gut tissues.For each pairwise comparison, the reference tissue was specified to determine the significantly upregulated genes in each tissue type (i.e., positive vs negative logfold change).
#run DESeq2 function and normalization
dds_vtsujii <- DESeq(dds_count_table_organ_level)
#set Eyes as reference tissue
dds_vtsujii$group <- relevel(dds_vtsujii$group, ref= "Eyes")
#rerun DESeq command after reference is specified
dds_vtsujii <- DESeq(dds_vtsujii)
#define contrasts, extract results table, and shrink the log2 fold changes
res_tableOE_unshrunken_Gut_Vs_Eye <- results(dds_vtsujii, contrast= c("group", "Gut", "Eyes") , alpha = 0.05)
res_tableOE_Gut_Vs_Eye <- lfcShrink(dds_vtsujii, contrast= c("group", "Gut", "Eyes"), res = res_tableOE_unshrunken_Gut_Vs_Eye )
mcols(res_tableOE_Gut_Vs_Eye, use.names=T)
# set thresholds
# lfc.cutoff value of 0.58 translates to a 1.5 log2 fold change
# padj.cutoff value of 0.05
padj.cutoff <- 0.05
lfc.cutoff <- 0.58
res_tableOE_Gut_Vs_Eye_tb <- res_tableOE_Gut_Vs_Eye %>%
data.frame() %>%
rownames_to_column(var="gene") %>%
as_tibble()
#determine all differentially expressed genes
sigOE_Gut_Vs_Eye <- res_tableOE_Gut_Vs_Eye_tb %>%
filter(padj < padj.cutoff & abs(log2FoldChange) > lfc.cutoff)
#extract all genes that are significantly upregulated in the Gut (positive log2 fold change)
sigOE_UPREGULATED_logfold_Gut_vs_Eye <- sigOE_Gut_Vs_Eye %>%
filter(padj < padj.cutoff & log2FoldChange > lfc.cutoff)
head(sigOE_UPREGULATED_logfold_Gut_vs_Eye)
colnames(sigOE_UPREGULATED_logfold_Gut_vs_Eye)[1]<- "transcript_id"
#add annotation
sigOE_UPREGULATED_logfold_Gut_vs_Eye_annot <- left_join(sigOE_UPREGULATED_logfold_Gut_vs_Eye,Trinotate_lym_subset,by="transcript_id")
#genes that are significantly upregulated in the Gut (positive log2fold change)
head(sigOE_UPREGULATED_logfold_Gut_vs_Eye_annot)
#extract all genes that are significantly upregulated in the Compound Eye (negative log2fold change) but that are downregulated in the Gut.
sigOE_DOWNREGULATED_logfold_Gut_vs_Eye <- sigOE_Gut_Vs_Eye %>%
filter(padj < padj.cutoff & log2FoldChange < lfc.cutoff)
#save these two dataframes for downstream analysis in Section 9.4
#genes that are significantly upregulated in the Gut (positive log2fold change)
head(sigOE_UPREGULATED_logfold_Gut_vs_Eye)
#genes that are significantly upregulated in the Compound Eye (negative log2fold change) but that are downregulated in the Gut.
head(sigOE_DOWNREGULATED_logfold_Gut_vs_Eye)
#define contrasts, extract results table, and shrink the log2 fold changes
res_tableOE_unshrunken_Bio_UpperLip_Vs_Eye <- results(dds_vtsujii, contrast= c("group", "Upper_lip", "Eyes") , alpha = 0.05)
res_tableOE_Bio_UpperLip_Vs_Eye <- lfcShrink(dds_vtsujii, contrast= c("group", "Upper_lip", "Eyes"), res = res_tableOE_unshrunken_Bio_UpperLip_Vs_Eye)
mcols(res_tableOE_Bio_UpperLip_Vs_Eye, use.names=T)
res_tableOE_Bio_UpperLip_Vs_Eye_tb <- res_tableOE_Bio_UpperLip_Vs_Eye %>%
data.frame() %>%
rownames_to_column(var="gene") %>%
as_tibble()
#determine all differentially expressed genes
sigOE_Bio_UpperLip_Vs_Eye <- res_tableOE_Bio_UpperLip_Vs_Eye_tb %>%
filter(padj < padj.cutoff & abs(log2FoldChange) > lfc.cutoff)
#extract all genes that are significantly upregulated in the bioluminescent upper lip (positive log2 fold change)
sigOE_UPREGULATED_logfold_Bio_UpperLip_Vs_Eye <- sigOE_Bio_UpperLip_Vs_Eye %>%
filter(padj < padj.cutoff & log2FoldChange > lfc.cutoff)
head(sigOE_UPREGULATED_logfold_Bio_UpperLip_Vs_Eye)
colnames(sigOE_UPREGULATED_logfold_Bio_UpperLip_Vs_Eye)[1] <- "transcript_id"
#add annotation
sigOE_UPREGULATED_logfold_Bio_UpperLip_Vs_Eye_annot <- left_join(sigOE_UPREGULATED_logfold_Bio_UpperLip_Vs_Eye,Trinotate_lym_subset,by="transcript_id")
#genes that are significantly upregulated in Bioluminescent Upper Lip (positive log2fold change)
head(sigOE_UPREGULATED_logfold_Bio_UpperLip_Vs_Eye_annot)
#extract all genes that are significantly upregulated in the Compound Eye (negative log2fold change) but that are downregulated in the bioluminescent upper lip
sigOE_DOWNREGULATED_logfold_Bio_UpperLip_Vs_Eye <- sigOE_Bio_UpperLip_Vs_Eye %>%
filter(padj < padj.cutoff & log2FoldChange < lfc.cutoff)
colnames(sigOE_DOWNREGULATED_logfold_Bio_UpperLip_Vs_Eye)[1] <- "transcript_id"
#add annotation
sigOE_DOWNREGULATED_logfold_Bio_UpperLip_Vs_Eye_annot <- left_join(sigOE_DOWNREGULATED_logfold_Bio_UpperLip_Vs_Eye,Trinotate_lym_subset,by="transcript_id")
head(sigOE_DOWNREGULATED_logfold_Bio_UpperLip_Vs_Eye_annot)
#save these two dataframes for downstream analysis in Section 9.4
#genes that are significantly upregulated in Bioluminescent Upper Lip (positive log2fold change)
head(sigOE_UPREGULATED_logfold_Bio_UpperLip_Vs_Eye)
#genes that are significantly upregulated in the Compound Eye (negative log2fold change) but that are downregulated in the bioluminescent upper lip
head(sigOE_DOWNREGULATED_logfold_Bio_UpperLip_Vs_Eye)
#now set gut as reference
dds_vtsujii$group <- relevel(dds_vtsujii$group, ref= "Gut")
#rerun DESeq after setting a new reference
dds_vtsujii <- DESeq(dds_vtsujii)
#Define contrasts, extract results table, and shrink the log2 fold changes
res_tableOE_unshrunken_Bio_Upper_Lip_Vs_Gut <- results(dds_vtsujii, contrast= c("group", "Upper_lip", "Gut") , alpha = 0.05)
res_tableOE_Bio_Upper_Lip_Vs_Gut <- lfcShrink(dds_vtsujii, contrast= c("group", "Upper_lip", "Gut"), res = res_tableOE_unshrunken_Bio_Upper_Lip_Vs_Gut )
mcols(res_tableOE_Bio_Upper_Lip_Vs_Gut, use.names=T)
res_tableOE_Bio_Upper_Lip_Vs_Gut_tb <- res_tableOE_Bio_Upper_Lip_Vs_Gut %>%
data.frame() %>%
rownames_to_column(var="gene") %>%
as_tibble()
#all differentially expressed genes
sigOE_Bio_Upper_Lip_Vs_Gut <- res_tableOE_Bio_Upper_Lip_Vs_Gut_tb %>%
filter(padj < padj.cutoff & abs(log2FoldChange) > lfc.cutoff)
#extract all genes that are significantly upregulated in the bioluminescent upper lip (positive log2 fold change)
sigOE_UPREGULATED_logfold_Bio_Upper_Lip_Vs_Gut <- sigOE_Bio_Upper_Lip_Vs_Gut %>%
filter(padj < padj.cutoff & log2FoldChange > lfc.cutoff)
head(sigOE_UPREGULATED_logfold_Bio_Upper_Lip_Vs_Gut)
colnames(sigOE_UPREGULATED_logfold_Bio_Upper_Lip_Vs_Gut)[1] <- "transcript_id"
#add annotation
sigOE_UPREGULATED_logfold_Bio_Upper_Lip_Vs_Gut_annot <- left_join(sigOE_UPREGULATED_logfold_Bio_Upper_Lip_Vs_Gut,Trinotate_lym_subset,by="transcript_id")
head(sigOE_UPREGULATED_logfold_Bio_Upper_Lip_Vs_Gut_annot)
#extract all genes that are significantly upregulated in the Gut (negative log2fold change) but that are downregulated in the bioluminescent upper lip.
sigOE_DOWNREGULATED_logfold_Bio_Upper_Lip_Vs_Gut <- sigOE_Bio_Upper_Lip_Vs_Gut %>%
filter(padj < padj.cutoff & log2FoldChange < lfc.cutoff)
colnames(sigOE_DOWNREGULATED_logfold_Bio_Upper_Lip_Vs_Gut)[1]<- "transcript_id"
#add annotation
sigOE_DOWNREGULATED_logfold_Bio_Upper_Lip_Vs_Gut_annot <- left_join(sigOE_DOWNREGULATED_logfold_Bio_Upper_Lip_Vs_Gut, Trinotate_lym_subset,by="transcript_id")
#save these two dataframes for downstream analysis in Section 9.3
#genes that are significantly upregulated in the bioluminescent upper lip (positive log2 fold change)
head(sigOE_UPREGULATED_logfold_Bio_Upper_Lip_Vs_Gut)
#genes that are significantly upregulated in the gut (negative log2fold change) but that are downregulated in the bioluminescent upper lip.
head(sigOE_DOWNREGULATED_logfold_Bio_Upper_Lip_Vs_Gut_annot)
To identify tissue-specific differential expression (i.e., significantly upregulated genes that are uniquely expressed), each tissue was compared to the other two and tissue-specific genes were extracted from the intersection of the Venn diagram (in Section 9.4.4). For example, the expression in the bioluminescent upper lip was determined by comparing it to both the compound eye and gut tissues.
#create a dataframe with all significantly upregulated genes of the bioluminescent upper lip.
#merge dataframes that have significantly upregulated genes of the bioluminescent upper lip from pairwise comparisons - Bio Upper Lip vs Gut and Bio Upper Lip vs Eye.
head(sigOE_UPREGULATED_logfold_Bio_Upper_Lip_Vs_Gut)
head(sigOE_UPREGULATED_logfold_Bio_UpperLip_Vs_Eye)
colnames(sigOE_UPREGULATED_logfold_Bio_Upper_Lip_Vs_Gut)[1] <- "gene"
colnames(sigOE_UPREGULATED_logfold_Bio_UpperLip_Vs_Eye)[1] <- "gene"
#merge
merged_upper_lips_df <- rbind(
sigOE_UPREGULATED_logfold_Bio_Upper_Lip_Vs_Gut,
sigOE_UPREGULATED_logfold_Bio_UpperLip_Vs_Eye
)
head(merged_upper_lips_df)
# The same gene can be found in both Bio Upper Lip vs Gut and Bio Upper Lip vs Eye. Remove gene duplicates while retaining one duplicate.
merged_upper_lips_unique <- merged_upper_lips_df[!duplicated(merged_upper_lips_df$gene), ]
head(merged_upper_lips_unique)
#compound eye
#create a dataframe with all significantly upregulated genes of the compound eye.
#merge dataframes that have significantly upregulated genes of the compound eye from pairwise comparisons - Bio Upper Lip vs Eye and Gut vs Eye.
head(sigOE_DOWNREGULATED_logfold_Bio_UpperLip_Vs_Eye)
head(sigOE_DOWNREGULATED_logfold_Gut_vs_Eye)
colnames(sigOE_DOWNREGULATED_logfold_Bio_UpperLip_Vs_Eye)[1]<- "gene"
#merged eye
merged_Eye_df <- rbind(sigOE_DOWNREGULATED_logfold_Bio_UpperLip_Vs_Eye , sigOE_DOWNREGULATED_logfold_Gut_vs_Eye)
#The same gene can be found in both Bio Upper Lip vs Eye and Gut vs Eye. Remove gene duplicates while retaining one duplicate.
merged_Eye_unique <-merged_Eye_df[!duplicated(merged_Eye_df$gene), ]
head(merged_Eye_unique)
#create a dataframe with all significantly upregulated genes of the gut.
#merge dataframes that have significantly upregulated genes of the gut from pairwise comparisons - Gut vs Eye and Bio Upper Lip vs Gut.
head(sigOE_UPREGULATED_logfold_Gut_vs_Eye)
head(sigOE_DOWNREGULATED_logfold_Bio_Upper_Lip_Vs_Gut)
colnames(sigOE_UPREGULATED_logfold_Gut_vs_Eye)[1] <- "gene"
colnames(sigOE_DOWNREGULATED_logfold_Bio_Upper_Lip_Vs_Gut)[1]<- "gene"
#merge
merged_Gut_df <- rbind(sigOE_UPREGULATED_logfold_Gut_vs_Eye , sigOE_DOWNREGULATED_logfold_Bio_Upper_Lip_Vs_Gut)
#The same gene can be found in both Gut vs Eye and Bio Upper Lip vs Eye. Remove gene duplicates while retaining one duplicate.
merged_Gut_unique <-merged_Gut_df[!duplicated(merged_Gut_df$gene), ]
head(merged_Gut_unique)
#generate a venn diagram to visualize shared significantly upregulated genes across tissue types and extract the genes that are unique to each tissue type.
unique_venn_list <- list(
Bio_Upper_Lip = merged_upper_lips_unique$gene ,
Gut = merged_Gut_unique$gene,
Compound_Eye = merged_Eye_unique$gene
)
ggvenn_unique <- ggvenn(
unique_venn_list,
fill_color = c('#AC97C9','#C97D97', '#F2C93D'),
stroke_size = .7, set_name_size = 6, text_size = 5
)
ggvenn_unique
# Open a PDF device
pdf("Luminous_DGE.pdf", width = 8, height = 6)
ggvenn_unique
dev.off()
#prep dataframes for extraction
Bio_Upper_Lip <- as.data.frame(merged_upper_lips_unique$gene)
colnames(Bio_Upper_Lip)[1]<- "gene"
Gut <- as.data.frame(merged_Gut_unique$gene)
colnames(Gut)[1]<- "gene"
Compound_eye <- as.data.frame(merged_Eye_unique$gene)
colnames(Compound_eye)[1]<- "gene"
# compare and extract unique genes for each tissue type
unique_genes_bio_upper_lip <- anti_join(Bio_Upper_Lip, Gut, by = "gene") %>%
anti_join(Compound_eye, by = "gene")
unique_genes_gut <- anti_join(Gut, Bio_Upper_Lip, by = "gene") %>%
anti_join(Compound_eye, by = "gene")
unique_genes_compound_eye <- anti_join(Compound_eye, Bio_Upper_Lip, by = "gene") %>%
anti_join(Gut, by = "gene")
nrow(unique_genes_bio_upper_lip)
nrow(unique_genes_gut)
nrow(unique_genes_compound_eye)
#add the annotations back to the unique genes in each tissue type by subsetting
unique_genes_bio_upper_lip_info <- merged_upper_lips_unique %>%
filter(gene %in% unique_genes_bio_upper_lip$gene)
nrow(unique_genes_bio_upper_lip_info)
unique_genes_eye_info <- merged_Eye_unique %>%
filter(gene %in% unique_genes_compound_eye$gene)
nrow(unique_genes_eye_info)
unique_genes_gut_info <- merged_Gut_unique %>%
filter(gene %in% unique_genes_gut$gene)
nrow(unique_genes_gut_info)
#add annotations back
colnames(unique_genes_bio_upper_lip_info)[1]<- "transcript_id"
unique_genes_bio_upper_lip_info_annot <- left_join(unique_genes_bio_upper_lip_info,Trinotate_lym_subset,by="transcript_id")
#add annotations back
colnames(unique_genes_eye_info)[1]<- "transcript_id"
unique_genes_eye_info_annot <- left_join(unique_genes_eye_info,Trinotate_lym_subset,by="transcript_id")
#add annotations backs
colnames(unique_genes_gut_info)[1] <- "transcript_id"
unique_genes_gut_info_annot <- left_join(unique_genes_gut_info,Trinotate_lym_subset,by="transcript_id")
#write.csv(unique_genes_bio_upper_lip_info_annot, file = "df_Vtsujii_sigfig_upreg_unique_BioUpperLip.csv")
#write.csv(unique_genes_eye_info_annot, file = "df_Vtsujii_sigfig_upreg_unique_comEye.csv")
#write.csv(unique_genes_gut_info_annot, file = "df_Vtsujii_sigfig_upreg_unique_Gut.csv")
The topGO package was used to perform GO enrichment analysis on significantly upregulated genes (i.e., uniquely expressed) in the bioluminescent upper lip. (Alexa Rahnenfuhrer, 2024). The figures representing the GO enrichments presented here were utilized for analysis, but were not included in the main text of the publication. The GO enrichments were visualized with GO-Figure! package for publication (Reijnders and Waterhouse, 2021). GO-Figure! reference https://github.com/lmesrop/BCN_publication/tree/main/Go-Figure!.
#import the trinotate go sheet from Trinotate output
geneID2GO_bio <- readMappings(file ="Trinotate_go_lym.txt")
#significantly upregulated genes (i.e. expressed uniquely) in the bioluminescent upper lip
head(unique_genes_bio_upper_lip_info)
#save the transcript ids of all the annotated genes under geneNames object
geneNames_bio<- as.character(Trinotate_lym_subset$transcript_id)
#save the transcript
myInterestingGenes_bio= as.character(unique_genes_bio_upper_lip_info$gene)
#subset the genesNames by the transcript IDs in my red module
geneList_bio <- factor(as.integer(geneNames_bio %in% myInterestingGenes_bio))
names(geneList_bio) <- geneNames_bio
head(geneList_bio)
#run the topGO function.
GOdata_bio <- new("topGOdata", ontology = "BP", allGenes = geneList_bio,
annot = annFUN.gene2GO, gene2GO = geneID2GO_bio)
results_go_bio <- runTest(GOdata_bio, algorithm="weight01", statistic="Fisher")
#retrieve the GO enrichment
goEnrichment_bio <- GenTable(GOdata_bio, Fisher = results_go_bio, orderBy = "Fisher", topNodes =100, numChar =1000 )
goEnrichment_bio$Fisher <- as.numeric(goEnrichment_bio$Fisher)
goEnrichment_bio <- goEnrichment_bio[goEnrichment_bio$Fisher < 0.05,]
goEnrichment_bio <- goEnrichment_bio[goEnrichment_bio$Significant > 1,]
#goEnrichment_bio <- goEnrichment_bio[,c("GO.ID","Term", "Annotated","Significant", "Expected", "Fisher")]
goEnrichment_bio
#write.table(goEnrichment_bio, "df_TopGO_Vargula_tsujii_DE_unique_BioUpperLip_BP.tsv",sep = "\t", quote=FALSE)
myterms_bio =goEnrichment_bio$GO.ID
mygenes_bio = genesInTerm(GOdata_bio, myterms_bio)
#extract the transcript ids for each GO term
for (i in 1:length(myterms_bio))
{
myterm_bio <- myterms_bio[i]
mygenesforterm_bio <- mygenes_bio[myterm_bio][[1]]
myfactor_bio <- mygenesforterm_bio %in% myInterestingGenes_bio
mygenesforterm2_bio <- mygenesforterm_bio[myfactor_bio == TRUE]
mygenesforterm2_bio <- paste(mygenesforterm2_bio, collapse=',')
print(paste("Term",myterm_bio,"genes:",mygenesforterm2_bio))
}
ntop = 48
ggdata <- goEnrichment_bio[1:ntop,]
ggdata$Term <- factor(ggdata$Term, levels = rev(ggdata$Term))
plot_Bio_UL_BP <- ggplot(ggdata,
aes(x = Term, y = -log10(Fisher), size = Significant, fill = -log10(Fisher))) +
ylim(1,11) +
geom_point(shape = 21) +
scale_size(range = c(2.5,12.5)) +
scale_fill_continuous(low = 'royalblue', high = 'red4') +
labs(
title = 'GO Analysis - Biological Process')+
theme_bw(base_size = 24) +
labs(size= "Number of Genes")+
theme(
legend.position = 'right',
legend.background = element_rect(),
plot.title = element_text(angle = 0, size = 16, face = 'bold', vjust = 1),
plot.subtitle = element_text(angle = 0, size = 10, face = 'bold', vjust = 1),
plot.caption = element_text(angle = 0, size = 12, face = 'bold', vjust = 1),
axis.text.x = element_blank(),
axis.text.y = element_text(angle = 0, size = 13, face = 'bold', vjust = 0.5),
#axis.title = element_text(size = 12, face = 'bold'),
axis.title.x = element_blank(),
#axis.title.y = element_text(size = 8, face = 'bold'),
axis.line = element_line(colour = 'black'),
#Legend
legend.key = element_blank(),
legend.text = element_text(size = 14, face = "bold"),
title = element_text(size = 14, face = "bold")) +
coord_flip()
#dev.off()
plot_Bio_UL_BP + labs(x = NULL)
options(repr.plot.width=10, repr.plot.height=8, repr.plot.res = 500)
#run the topGO function.
GOdata_bio_MF <- new("topGOdata", ontology = "MF", allGenes = geneList_bio,
annot = annFUN.gene2GO, gene2GO = geneID2GO_bio)
results_go_bio_MF <- runTest(GOdata_bio_MF, algorithm="weight01", statistic="Fisher")
#retrieve the GO enrichment
goEnrichment_bio_MF <- GenTable(GOdata_bio_MF, Fisher = results_go_bio_MF, orderBy = "Fisher", topNodes =100, numChar =1000 )
goEnrichment_bio_MF$Fisher <- as.numeric(goEnrichment_bio_MF$Fisher)
goEnrichment_bio_MF <- goEnrichment_bio_MF[goEnrichment_bio_MF$Fisher < 0.05,]
goEnrichment_bio_MF <- goEnrichment_bio_MF[goEnrichment_bio_MF$Significant > 1,]
goEnrichment_bio_MF <- goEnrichment_bio_MF[,c("GO.ID","Term", "Annotated","Significant", "Expected", "Fisher")]
goEnrichment_bio_MF
ntop = 29
ggdata <- goEnrichment_bio_MF[1:ntop,]
ggdata$Term <- factor(ggdata$Term, levels = rev(ggdata$Term))
plot_Bio_UL_MF <- ggplot(ggdata,
aes(x = Term, y = -log10(Fisher), size = Significant, fill = -log10(Fisher))) +
ylim(1,11) +
geom_point(shape = 21) +
scale_size(range = c(2.5,12.5)) +
scale_fill_continuous(low = 'royalblue', high = 'red4') +
labs(
title = 'GO Analysis - Molecular Function')+
theme_bw(base_size = 24) +
labs(size= "Number of Genes")+
theme(
legend.position = 'right',
legend.background = element_rect(),
plot.title = element_text(angle = 0, size = 16, face = 'bold', vjust = 1),
plot.subtitle = element_text(angle = 0, size = 10, face = 'bold', vjust = 1),
plot.caption = element_text(angle = 0, size = 12, face = 'bold', vjust = 1),
axis.text.x = element_blank(),
axis.text.y = element_text(angle = 0, size = 13, face = 'bold', vjust = 0.5),
#axis.title = element_text(size = 12, face = 'bold'),
axis.title.x = element_blank(),
#axis.title.y = element_text(size = 8, face = 'bold'),
axis.line = element_line(colour = 'black'),
#Legend
legend.key = element_blank(),
legend.text = element_text(size = 14, face = "bold"),
title = element_text(size = 14, face = "bold")) +
coord_flip()
#dev.off()
plot_Bio_UL_MF + labs(x=NULL)
options(repr.plot.width=16, repr.plot.height=8, repr.plot.res = 500)
P. Langfelder, S. Horvath, WGCNA: an R package for weighted correlation network analysis. BMC Bioinformatics 9, 559 (2008)
M. I. Love, W. Huber, S. Anders, Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biol. 15, 550 (2014)
Alexa A, Rahnenfuhrer J, topGO: Enrichment Analysis for Gene Ontology (2024)
Shannon, Paul et al., Cytoscape: a software environment for integrated models of biomolecular interaction networks. Genome research vol. 13,11 (2003)